Question 1:
First we will introduce our libraries:
library(dplyr)
library(readr)
library(highcharter)
library(plotly)
library(ggplot2)
library(ROCR)
library(grid)
library(scales)
library(gridExtra)
library(tidyr)
library(ggthemes)
Now we will use the plotly package:
# Question 1: interactive 3-D scatter of earthquake epicentres (longitude,
# latitude, depth), with marker size mapped to magnitude.
earth <- readRDS("/Users/shervin/Downloads/week_11/data/historical_web_data_26112015.rds")
# Columns are referenced through the `data` argument, so the formulas use bare
# column names rather than `earth$...`.  The trace type and mode are stated
# explicitly, which silences plotly's "No trace type specified" /
# "No scatter3d mode specifed" messages seen in the original output.
plot_ly(earth,
        x = ~Longitude,
        y = ~Latitude,
        z = ~Depth,
        size = ~Magnitude,
        type = "scatter3d",
        mode = "markers")
Question 2:
disaster = read_delim("/Users/shervin/Downloads/week_11/data/disaster.txt", "\t", escape_double = FALSE, trim_ws = TRUE)## Parsed with column specification:
## cols(
## .default = col_integer(),
## FLAG_TSUNAMI = col_character(),
## SECOND = col_character(),
## EQ_PRIMARY = col_double(),
## EQ_MAG_MW = col_double(),
## EQ_MAG_MS = col_double(),
## EQ_MAG_MB = col_character(),
## EQ_MAG_ML = col_double(),
## EQ_MAG_MFA = col_character(),
## EQ_MAG_UNK = col_double(),
## COUNTRY = col_character(),
## STATE = col_character(),
## LOCATION_NAME = col_character(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## MISSING = col_character(),
## DAMAGE_MILLIONS_DOLLARS = col_character(),
## TOTAL_MISSING = col_character(),
## TOTAL_MISSING_DESCRIPTION = col_character(),
## TOTAL_DAMAGE_MILLIONS_DOLLARS = col_character()
## )
## See spec(...) for full column specifications.
# Question 2: bubble map of earthquakes; bubble size (z) is the ML magnitude.
# BUG FIX: the original select() listed `sequence = YEAR` twice; one copy is
# removed.  (The tsunami column is selected then dropped, as in the original.)
disaster_temp <- disaster %>%
  select(tsunami = FLAG_TSUNAMI,
         lat = LATITUDE,
         lon = LONGITUDE,
         name = COUNTRY,
         sequence = YEAR,
         z = EQ_MAG_ML) %>%
  select(lat, lon, z, name, sequence) %>%
  na.omit()
hcmap() %>%
  hc_add_series(data = disaster_temp, type = "mapbubble",
                minSize = 0, maxSize = 30, color = "blue") %>%
  hc_plotOptions(series = list(showInLegend = FALSE))
# Iran earthquake data (this statement was fused onto the previous line in
# the original; presumably it starts Question 3 — heading lost in extraction).
iran_earthquake <- read_rds("/Users/shervin/Downloads/week_11/data/iran_earthquake.rds")
# Keep only earthquakes with magnitude above 3.
iran_earthquake %>% filter(Mag > 3) -> iran_earthquake_temp
# Density plot of epicentres.  BUG FIX: longitude now goes on the x axis and
# latitude on the y axis so the axis labels match the plotted variables (the
# original mapped Lat to x but labelled that axis "Longitude").
plot <- ggplot(data = iran_earthquake_temp, aes(x = Long, y = Lat)) + geom_point()
plot + stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  xlab("Longitude") + ylab("Latitude")
# We should filter again to get a better diagram:
iran_earthquake_temp %>% filter(Lat <= 45, Long <= 70) -> iran_earthquake_temp_checked
plot <- ggplot(data = iran_earthquake_temp_checked, aes(x = Long, y = Lat)) + geom_point()
plot + stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  xlab("Longitude") + ylab("Latitude")
Question 4:
great_earthquake <- disaster %>% filter(COUNTRY == "IRAN", EQ_PRIMARY >= 7)  # strongest earthquakes in Iran
# Inter-arrival times (in years) between consecutive magnitude >= 7 events.
# BUG FIX: base::diff() computes consecutive differences directly.  The
# original shift-and-subtract construction padded the shifted vector with 0,
# which produced a spurious trailing value (-2017) that a hard-coded row
# index (`[-26, 1]`) failed to remove.
# NOTE(review): like the original, this assumes YEAR is already sorted
# ascending in the data — confirm, or add arrange(YEAR) before diff().
Year <- great_earthquake %>% filter(YEAR > 0) %>% pull(YEAR)
Diff <- diff(Year)  # time between every two earthquakes with magnitude >= 7
print(Diff)
## Original (buggy) output, for reference — note the -2017 artifact at the end:
## [1] 20 93 186 167 127 53 94 125 113 59 110
## [12] 19 20 1 27 5 1 0 5 10 1 2
## [23] 9 7 16 -2017
#Now we will find the exact time difference from the last earthquake:
iran_earthquake %>% filter(Mag >= 7) %>% select(OriginTime) %>% arrange(desc(OriginTime)) -> time
From this we can see that the last great earthquake was more than two years ago, and since the interval between such earthquakes is usually shorter than this, the probability that we will experience an earthquake in the near future is high.
Question 5:
# Question 5: total and average earthquake deaths per country.
disaster %>%
  select(lat = LATITUDE, lon = LONGITUDE, z = TOTAL_DEATHS, name = COUNTRY) %>%
  na.omit() %>%
  group_by(name) %>%
  summarise(mean = mean(z), total = sum(z)) -> disaster_average
# Map of total deaths per country.  (The two hcmap chains below were fused
# onto one line in the original; the series `name` now matches what each map
# actually shows.)
hcmap(data = disaster_average, value = "total", name = "Total Deaths by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Total Deaths of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed by number of people', align = 'center')
# Map of average deaths per country.
hcmap(data = disaster_average, value = "mean", name = "Average Deaths by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Average total Death of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed by number of people', align = 'center')
Question 6:
We can make a generalised linear model (glm) using the given columns of Latitude and Longitude. A fair guess is to take the family of this glm to be Poisson. We can check this assumption by plotting the density of the deaths:
# Inspect the deaths column before fitting.  BUG FIX: the original statement
# was `select(lat = )` — an empty select that returned a 0-column tibble
# (the fused output `6,026 x 0` confirms it selected nothing).
disaster %>% select(DEATHS) %>% head()
# Poisson GLM: model earthquake deaths as a log-linear function of location
# (latitude, longitude), focal depth, and primary magnitude.  glm() drops
# rows with NA in any of these columns (see "observations deleted" below).
model = glm(data = disaster, DEATHS ~ LATITUDE + LONGITUDE + FOCAL_DEPTH + EQ_PRIMARY,
family = poisson(link = "log"))
# All four coefficients come out highly significant, but note the huge
# residual deviance relative to the degrees of freedom — strong evidence of
# overdispersion, so the plain Poisson family is questionable here.
summary(model)##
## Call:
## glm(formula = DEATHS ~ LATITUDE + LONGITUDE + FOCAL_DEPTH + EQ_PRIMARY,
## family = poisson(link = "log"), data = disaster)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -465.08 -56.07 -35.51 -17.72 1520.67
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.641e+00 5.209e-03 -315.01 <2e-16 ***
## LATITUDE 2.776e-02 3.245e-05 855.56 <2e-16 ***
## LONGITUDE 2.751e-04 6.960e-06 39.52 <2e-16 ***
## FOCAL_DEPTH -2.904e-02 4.618e-05 -628.84 <2e-16 ***
## EQ_PRIMARY 1.363e+00 7.126e-04 1912.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16778240 on 1182 degrees of freedom
## Residual deviance: 12197978 on 1178 degrees of freedom
## (4843 observations deleted due to missingness)
## AIC: 12203656
##
## Number of Fisher Scoring iterations: 8
# Density of the DEATHS column, used to eyeball the Poisson assumption.
# BUG FIX: the heading "Question 7:" was fused onto the last code line in the
# original, making it invalid R; it is moved to its own line below.
disaster %>% select(DEATHS) %>% na.omit() %>% as.matrix() -> disaster_deaths
hchart(density(disaster_deaths)) %>%
  hc_title(text = 'Density Distribution of death') %>%
  hc_xAxis(title = list(text = "Number of Deaths")) %>%
  hc_yAxis(title = list(text = "Probability Density"))
Question 7:
We have to make a glm prediction model and make a prediction based on this model.
# We first have to make the longitude and latitude a round number:
# Question 7: logistic regression predicting whether a (lat, lon, month)
# grid cell experiences a quake of magnitude > 4, from the count of events.
worldwide <- read.csv("/Users/shervin/Downloads/week_11/data/worldwide.csv")
worldwide %>% filter(type == "earthquake") %>% select(time, latitude, longitude, mag) -> worldwide_temp
# Round coordinates so nearby events fall into the same 1-degree grid cell.
worldwide_temp %>% mutate(lat = round(latitude),
                          lon = round(longitude)) %>%
  select(time, lat, lon, mag) %>%
  na.omit() -> worldwide_temp
worldwide_temp$time <- as.Date(worldwide_temp$time)
# Month index counted continuously across years (12 * year + month).
worldwide_temp %>% mutate(Month = (12 * as.integer(format(time, "%y")) + as.integer(format(time, "%m")))) -> worldwide_temp
# We define a foreshock ("pish-larzeh") as an earthquake with magnitude 4 or less:
worldwide_temp %>% group_by(lat, lon, Month) %>%
  summarise(max_e = max(mag), count = n(), average = sum(mag) / count) %>%
  mutate(Larzeh = as.integer(max_e > 4)) %>% na.omit() -> worldwide_temp_larzeh
worldwide_temp_larzeh$Larzeh <- as.factor(worldwide_temp_larzeh$Larzeh)
set.seed(1)  # make the 80/20 train/test split reproducible
test <- worldwide_temp_larzeh %>% sample_frac(0.2)
train <- anti_join(worldwide_temp_larzeh, test)
## Joining, by = c("lat", "lon", "Month", "max_e", "count", "average", "Larzeh")
model <- glm(data = train,
             formula = Larzeh ~ count,
             family = binomial(link = 'logit'))
# BUG FIX: predict() takes `newdata`, not `data`.  With `data = test` the
# argument was silently ignored and predictions were made on the TRAINING
# set.  (NOTE: the variable name `predict` shadows stats::predict; kept for
# compatibility with any later chunk that references it.)
predict <- predict(model, newdata = test, type = "response")
Question 8:
We can use correlation tests to first check whether these two are correlated or not. If they were correlated to a high degree, we would have to check whether we are falling into the "correlation is causation" trap. Therefore we use another test to check this assumption: a chi-square test, which checks whether these two come from the same distribution. If this test also agrees, we may interpret that they affect each other.
worldwide = read.csv("/Users/shervin/Downloads/week_11/data/worldwide.csv")
worldwide %>% select(depth , mag) %>% na.omit() -> worldwide.selected
# Spearman rank correlation between depth and magnitude (rank-based, so it
# does not assume normality of either variable).
q = cor.test(worldwide.selected$depth, worldwide.selected$mag,
alternative = c("two.sided"), method = c("spearman"))
q##
## Spearman's rank correlation rho
##
## data: worldwide.selected$depth and worldwide.selected$mag
## S = 2.8439e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2344252
# NOTE(review): chisq.test() on two continuous numeric columns treats every
# distinct value as a category — hence the enormous df below and p = 1.  The
# variables should be binned (e.g. with cut()) before a chi-square test is
# meaningful; as written this test carries little information.
chisq.test(worldwide$mag , worldwide$depth)##
## Pearson's Chi-squared test
##
## data: worldwide$mag and worldwide$depth
## X-squared = 3403700, df = 4463700, p-value = 1
The first test confirms that there exists a weak correlation between these two (the p-value is below our cut-off, so the correlation is statistically significant, but rho is only about 0.23). So we can conclude that these two are related, though only weakly.
Question 9:
We will first find the average number of earthquakes. Then, to see whether HAARP was real or not, we run a t-test: we state a null hypothesis and check whether it can be rejected. The claim is that HAARP was intended to activate Japanese earthquakes, so our null hypothesis is that the number of earthquakes before and after HAARP is not different.
# Re-read the worldwide catalogue with readr so that `time` is parsed as a
# proper datetime (read.csv above left it as character/factor).
worldwide = read_csv("/Users/shervin/Downloads/week_11/data/worldwide.csv")## Parsed with column specification:
## cols(
## .default = col_double(),
## time = col_datetime(format = ""),
## magType = col_character(),
## nst = col_integer(),
## net = col_character(),
## id = col_character(),
## updated = col_datetime(format = ""),
## place = col_character(),
## type = col_character(),
## magNst = col_integer(),
## status = col_character(),
## locationSource = col_character(),
## magSource = col_character()
## )
## See spec(...) for full column specifications.
# Yearly earthquake counts per exact (longitude, latitude) coordinate.
# NOTE(review): grouping on raw (un-rounded) coordinates means almost every
# event is its own group — hence count = 1 in nearly every row below.
worldwide %>% filter(type =="earthquake") %>% select(longitude,latitude,time) %>%
mutate(year = as.numeric(format(time, "%Y"))) %>% group_by(longitude,latitude,year) %>% summarise(count = n()) -> worldwide_earthquakes_count
print(worldwide_earthquakes_count)## # A tibble: 60,105 x 4
## # Groups: longitude, latitude [?]
## longitude latitude year count
## <dbl> <dbl> <dbl> <int>
## 1 -180 -24.4 2016 1
## 2 -180 -23.5 2015 1
## 3 -180 -37.2 2016 1
## 4 -180 -24.8 2017 1
## 5 -180 -37.0 2016 1
## 6 -180 -23.8 2016 1
## 7 -180 -24.9 2016 1
## 8 -180 -18.3 2017 1
## 9 -180 -24.0 2017 1
## 10 -180 51.1 2018 1
## # ... with 60,095 more rows
# Same yearly counts, keyed by the textual `place` description instead.
worldwide %>% filter(type =="earthquake") %>% select(place,time) %>%
mutate(year = as.numeric(format(time, "%Y"))) %>% group_by(place,year) %>% summarise(count = n()) -> worldwide_earthquakes_count_place
print(worldwide_earthquakes_count_place)## # A tibble: 43,523 x 3
## # Groups: place [?]
## place year count
## <chr> <dbl> <int>
## 1 0km E of Loiza, Puerto Rico 2016 1
## 2 0km E of Monte Grande, Puerto Rico 2017 1
## 3 0km E of San Andres, Philippines 2017 1
## 4 0km E of San Ramon, California 2016 1
## 5 0km E of The Geysers, California 2017 1
## 6 0km ENE of Balangiga, Philippines 2018 1
## 7 0km ENE of Candabong, Philippines 2017 1
## 8 0km ENE of Estancias de Florida, Puerto Rico 2016 1
## 9 0km ENE of Las Ollas, Puerto Rico 2016 1
## 10 0km ENE of Talisayan, Philippines 2017 1
## # ... with 43,513 more rows
# HAARP analysis: compare earthquake deaths in Japan before vs. after 1993
# (the year HAARP reportedly became active) with a Welch two-sample t-test.
disaster %>% filter( COUNTRY =="JAPAN", YEAR >=1950) %>% select(YEAR,DEATHS) %>% arrange(desc(YEAR)) %>% na.omit() -> disaster_japan
# BUG FIX: boundaries made inclusive/exclusive so every year lands in exactly
# one group.  The original used YEAR > 1950 (dropping 1950 even though the
# data was already filtered to YEAR >= 1950) and YEAR < 1993 / YEAR > 1993
# (dropping 1993 from both groups).
Haarp_before <- disaster_japan %>% filter(YEAR < 1993) %>% select(DEATHS) %>% as.matrix()
Haarp_after <- disaster_japan %>% filter(YEAR >= 1993) %>% select(DEATHS) %>% as.matrix()
t.test(Haarp_before, Haarp_after)
## Original output (with the original boundaries), for reference:
## Welch Two Sample t-test
##
## data: Haarp_before and Haarp_after
## t = -1.2265, df = 14.006, p-value = 0.2402
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1254.7459 341.7459
## sample estimates:
## mean of x mean of y
## 17.5 474.0
The t-test fails to reject the null hypothesis (p = 0.24 is well above any usual cut-off), so there is no statistically significant difference in earthquake deaths in Japan before and after HAARP. Therefore the data provide no evidence that HAARP had anything to do with Japanese earthquakes.
Question 10:
- What is the sum and average of the damage of earthquake in every country?
# Question 10a: total and average earthquake DAMAGE (millions of dollars)
# per country.  DAMAGE_MILLIONS_DOLLARS was parsed as character, so it is
# converted numerically.  BUG FIXES: as.numeric() replaces as.integer()
# (damage values can be fractional, and as.integer truncates), and the map
# titles/series names now say "Damage" — the originals were copy-pasted from
# Question 5 and wrongly said "Deaths".
disaster %>%
  select(lat = LATITUDE, lon = LONGITUDE, z = DAMAGE_MILLIONS_DOLLARS, name = COUNTRY) %>%
  na.omit() %>%
  group_by(name) %>%
  summarise(mean = mean(as.numeric(z)), total = sum(as.numeric(z))) %>%
  select(name, mean, total) -> disaster_average
hcmap(data = disaster_average, value = "total", name = "Total Damage by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Total Damage of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed in millions of dollars', align = 'center')
hcmap(data = disaster_average, value = "mean", name = "Average Damage by Earthquake",
      dataLabels = list(enabled = TRUE, format = '{point.name}')) %>%
  hc_title(text = 'Average Damage of Countries by Earthquake',
           align = 'center') %>%
  hc_subtitle(text = 'displayed in millions of dollars', align = 'center')
- Points on earth which have the most number of earthquakes
# Question 10b: the 20 one-degree grid points with the most earthquakes.
worldwide %>% filter(type == "earthquake") %>% select(time, latitude, longitude, mag) -> worldwide_temp
worldwide_temp %>% mutate(lat = round(latitude),
                          lon = round(longitude)) -> worldwide_temp
worldwide_temp %>% group_by(lat, lon) %>%
  summarise(z = n()) %>%
  arrange(desc(z)) -> worldwide_temp
hcmap() %>%
  hc_add_series(data = worldwide_temp[1:20, ], type = "mapbubble",
                minSize = 0, maxSize = 30, color = "blue") %>%
  hc_plotOptions(series = list(showInLegend = FALSE))
# NOTE: a duplicated select statement was fused onto the end of this chain in
# the original; it immediately overwrote worldwide_temp and was never read
# (the next section works from `worldwide` directly), so it has been removed.
# Dichotomise depth and magnitude: `deep` is 1 when log(depth) > 2 (i.e.
# depth > e^2, roughly 7.4 km); `high_mag` is 1 when magnitude > 5.
worldwide %>% mutate( deep = as.integer(log(depth)>2), high_mag = as.integer(mag>5)) %>% select(deep,high_mag) -> worldwide_depth
# Welch t-test of the mean `deep` indicator between the two magnitude groups
# (i.e. comparing the proportion of deep quakes among low- vs. high-magnitude
# events).
t.test(worldwide_depth$deep~worldwide_depth$high_mag)##
## Welch Two Sample t-test
##
## data: worldwide_depth$deep by worldwide_depth$high_mag
## t = -33.508, df = 4432.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1201130 -0.1068347
## sample estimates:
## mean in group 0 mean in group 1
## 0.8580998 0.9715736
We see that the null hypothesis is rejected, so depth is associated with magnitude. We can quantify the strength of the association with a Spearman correlation:
# Spearman correlation between the two binary indicators.  The tie warning is
# expected: with 0/1 data almost every observation is tied, so the p-value is
# approximate.
cor.test(worldwide_depth$deep,worldwide_depth$high_mag,
alternative = c("two.sided"), method = c("spearman"))## Warning in cor.test.default(worldwide_depth$deep, worldwide_depth
## $high_mag, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: worldwide_depth$deep and worldwide_depth$high_mag
## S = 3.416e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.07131091
This states that the two are correlated, although rho is only about 0.07, so the association — while statistically significant on this large sample — is very weak.